Introduction

This is code for Section 17.2 of Kruschke using a Student t noise distribution. Other than that change, this document is very similar to the one from section 7.1.

Make sure that the two files “HtWtData30.csv” and “HtWtData300.csv” are in your project directory.

Preliminaries

Load necessary packages:

library(tidyverse)
library(rstan)
library(tidybayes)
library(bayesplot)

Set Stan to save compiled code.

rstan_options(auto_write = TRUE)

Set Stan to use parallel processing where possible.

options(mc.cores = parallel::detectCores())

Data

HtWtData30 <- read_csv("HtWtData30.csv")
Rows: 30 Columns: 3── Column specification ─────────────────────────────────────────
Delimiter: ","
dbl (3): male, height, weight
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
HtWtData30
ggplot(HtWtData30, aes(y = weight, x = height)) +
    geom_point()

HtWtData300 <- read_csv("HtWtData300.csv")
Rows: 300 Columns: 3── Column specification ─────────────────────────────────────────
Delimiter: ","
dbl (3): male, height, weight
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
HtWtData300
ggplot(HtWtData300, aes(y = weight, x = height)) +
    geom_point()

Mean center the x variable to get an interpretable intercept. (Kruschke standardizes both variables using z-scores, but that makes the coefficients of the model harder to interpret.)

HtWtData30 <- HtWtData30 %>%
    mutate(height_mc = height - mean(height))
HtWtData30
ggplot(HtWtData30, aes(y = weight, x = height_mc)) +
    geom_point()

HtWtData300 <- HtWtData300 %>%
    mutate(height_mc = height - mean(height))
HtWtData300
ggplot(HtWtData300, aes(y = weight, x = height_mc)) +
    geom_point()

We’re going to run two different examples, so we’ll make two lists.

stan_data_30 <- list(N = NROW(HtWtData30),
                     y = HtWtData30$weight,
                     x = HtWtData30$height_mc)
stan_data_300 <- list(N = NROW(HtWtData300),
                      y = HtWtData300$weight,
                      x = HtWtData300$height_mc)

Prior

Simulation code

data {
    int<lower = 0> N;
    vector<lower = 0>[N] y;
    vector[N] x;
}
generated quantities {
    real nu;
    real beta0;
    real beta1;
    vector[N] mu;
    real<lower = 0> sigma;
    real y_sim[N];
    
    nu = gamma_rng(2, 0.1);       // default prior for df
    beta0 = normal_rng(150, 25);  // Intercept between 100 and 200
    beta1 = normal_rng(0, 5);    // -10 to 10 lbs per inch
    mu = beta0 + beta1 * x;       // linear model
    sigma = exponential_rng(1.0/100.0); // sd of 100 lbs
    y_sim = student_t_rng(nu, mu, sigma);
}
fit_HtWt30_robust_prior <- sampling(HtWt_robust_prior,
                                    data = stan_data_30,
                                    chains = 1,
                                    algorithm = "Fixed_param",
                                    seed = 11111,
                                    refresh = 0)
samples_HtWt30_robust_prior <- tidy_draws(fit_HtWt30_robust_prior)
samples_HtWt30_robust_prior

Examine prior

mcmc_hist(samples_HtWt30_robust_prior,
          pars = c("nu", "sigma", "beta0", "beta1"))

mcmc_pairs(fit_HtWt30_robust_prior,
           pars = c("nu", "sigma", "beta0", "beta1"))
Warning: Only one chain in 'x'. This plot is more useful with multiple chains.

ggplot(HtWtData30, aes(y = weight, x = height_mc)) +
    geom_point() +
    geom_abline(data = samples_HtWt30_robust_prior,
                aes(intercept = beta0, slope = beta1),
                alpha = 0.05)

Prior predictive distribution

y_sim_HtWt30_prior <- samples_HtWt30_robust_prior %>%
    select(starts_with("y_sim")) %>%
    as.matrix()
ppd_intervals(ypred = y_sim_HtWt30_prior,
              x = HtWtData30$height_mc)

Model

data {
    int<lower = 0> N;
    vector<lower = 0>[N] y;
    vector[N] x;
}
parameters {
    real nu;
    real beta0;
    real beta1;
    real<lower = 0> sigma;
}
transformed parameters {
    vector[N] mu;
    mu = beta0 + beta1 * x;   // linear model
}
model {
    nu ~ gamma(2, 0.1);       // default prior for df
    beta0 ~ normal(150, 25);  // Intercept between 100 and 200
    beta1 ~ normal(0, 5);    // -10 to 10 lbs per inch
    sigma ~ exponential(1.0/100.0); // sd of 100 lbs
    y ~ student_t(nu, mu, sigma);
}
generated quantities {
    real y_sim[N];
    y_sim = student_t_rng(nu, mu, sigma);
}
fit_HtWt30_robust <- sampling(HtWt_robust,
                              data = stan_data_30,
                              seed = 11111,
                              refresh = 0)
Warning: There were 615 divergent transitions after warmup. See
https://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
to find out why this is a problem and how to eliminate them.Warning: Examine the pairs() plot to diagnose sampling problems
samples_HtWt30_robust <- tidy_draws(fit_HtWt30_robust)
samples_HtWt30_robust

Model diagnostics

stan_trace(fit_HtWt30_robust,
           pars = c("nu", "sigma", "beta0", "beta1"))

mcmc_acf(fit_HtWt30_robust,
         pars = c("nu", "sigma", "beta0", "beta1"))

Model summary

print(fit_HtWt30_robust,
      pars = c("nu", "beta0", "beta1", "sigma"))
Inference for Stan model: anon_model.
4 chains, each with iter=2000; warmup=1000; thin=1; 
post-warmup draws per chain=1000, total post-warmup draws=4000.

        mean se_mean    sd   2.5%    25%    50%    75%  97.5%
nu     18.37    0.45 12.53   3.55   9.07  15.20  24.36  49.69
beta0 152.76    0.11  4.86 143.53 149.55 152.67 155.93 162.65
beta1   4.25    0.03  1.23   1.79   3.48   4.26   5.07   6.77
sigma  25.74    0.13  4.37  18.22  22.69  25.30  28.35  35.36
      n_eff Rhat
nu      776 1.01
beta0  1796 1.00
beta1  1399 1.00
sigma  1075 1.00

Samples were drawn using NUTS(diag_e) at Sat May 13 23:49:01 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).

Model visualization

mcmc_areas(fit_HtWt30_robust, pars = "nu")

mcmc_areas(fit_HtWt30_robust, pars ="sigma")

mcmc_areas(fit_HtWt30_robust, pars ="beta0")

mcmc_areas(fit_HtWt30_robust, pars = "beta1")

pairs(fit_HtWt30_robust,
      pars = c("nu", "sigma", "beta0", "beta1"))

Posterior predictive check

y_sim_HtWt30_robust <- samples_HtWt30_robust %>%
    select(starts_with("y_sim")) %>%
    as.matrix()
ggplot(HtWtData30, aes(y = weight, x = height_mc)) +
    geom_point() +
    geom_abline(data = samples_HtWt30_robust,
                aes(intercept = beta0, slope = beta1),
                alpha = 0.01) +
    geom_abline(data = samples_HtWt30_robust,
                aes(intercept = mean(beta0), slope = mean(beta1)),
                color = "blue", linewidth = 2)

Compare to standard linear regression.

ggplot(HtWtData30, aes(y = weight, x = height_mc)) +
    geom_point() +
    geom_abline(data = samples_HtWt30_robust,
                aes(intercept = beta0, slope = beta1),
                alpha = 0.01) +
    geom_abline(data = samples_HtWt30_robust,
                aes(intercept = mean(beta0), slope = mean(beta1)),
                color = "blue", linewidth = 2) +
    geom_smooth(method = lm, color = "red", linewidth = 2)

ppc_intervals(y = HtWtData30$weight,
              x = HtWtData30$height_mc,
              yrep = y_sim_HtWt30_robust,
              prob = 0.68,
              prob_outer = 0.95)

ppc_hist(y = HtWtData30$weight,
         yrep = y_sim_HtWt30_robust[1:19, ])

ppc_boxplot(y = HtWtData30$weight,
            yrep = y_sim_HtWt30_robust[1:10, ],
            notch = FALSE)

ppc_dens_overlay(y = HtWtData30$weight,
                 yrep = y_sim_HtWt30_robust[1:50, ])

ppc_stat_2d(y = HtWtData30$weight,
            yrep = y_sim_HtWt30_robust)

The model with 300 observations

fit_HtWt300_robust <- sampling(HtWt_robust,
                               data = stan_data_300,
                               seed = 11111,
                               refresh = 0)
samples_HtWt300_robust <- tidy_draws(fit_HtWt300_robust)
samples_HtWt300_robust

Model diagnostics

stan_trace(fit_HtWt300_robust,
           pars = c("nu", "sigma", "beta0", "beta1"))

mcmc_acf(fit_HtWt300_robust,
         pars = c("nu", "sigma", "beta0", "beta1"))

Model summary

print(fit_HtWt300_robust,
      pars = c("nu", "beta0", "beta1", "sigma"))
Inference for Stan model: anon_model.
4 chains, each with iter=2000; warmup=1000; thin=1; 
post-warmup draws per chain=1000, total post-warmup draws=4000.

        mean se_mean   sd   2.5%    25%    50%    75%  97.5%
nu      5.67    0.05 1.77   3.26   4.44   5.32   6.49  10.22
beta0 156.32    0.03 1.68 153.09 155.19 156.28 157.45 159.73
beta1   4.43    0.01 0.40   3.62   4.15   4.43   4.71   5.20
sigma  24.16    0.04 1.68  20.94  22.99  24.14  25.31  27.51
      n_eff Rhat
nu     1324    1
beta0  2508    1
beta1  3408    1
sigma  1766    1

Samples were drawn using NUTS(diag_e) at Sat May 13 23:49:57 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).

Model visualization

mcmc_areas(fit_HtWt300_robust, pars = "nu")

mcmc_areas(fit_HtWt300_robust, pars ="sigma")

mcmc_areas(fit_HtWt300_robust, pars ="beta0")

mcmc_areas(fit_HtWt300_robust, pars = "beta1")

pairs(fit_HtWt300_robust,
      pars = c("nu", "sigma", "beta0", "beta1"))

Posterior predictive check

y_sim_HtWt300_robust <- samples_HtWt300_robust %>%
    select(starts_with("y_sim")) %>%
    as.matrix()
ggplot(HtWtData300, aes(y = weight, x = height_mc)) +
    geom_point() +
    geom_abline(data = samples_HtWt300_robust,
                aes(intercept = beta0, slope = beta1),
                alpha = 0.01) +
    geom_abline(data = samples_HtWt300_robust,
                aes(intercept = mean(beta0), slope = mean(beta1)),
                color = "blue", linewidth = 2)

Compare to standard linear regression.

ggplot(HtWtData300, aes(y = weight, x = height_mc)) +
    geom_point() +
    geom_abline(data = samples_HtWt300_robust,
                aes(intercept = beta0, slope = beta1),
                alpha = 0.01) +
    geom_abline(data = samples_HtWt300_robust,
                aes(intercept = mean(beta0), slope = mean(beta1)),
                color = "blue", linewidth = 2) +
    geom_smooth(method = lm, color = "red", linewidth = 2)

ppc_intervals(y = HtWtData300$weight,
              x = HtWtData300$height_mc,
              yrep = y_sim_HtWt300_robust,
              prob = 0.68,
              prob_outer = 0.95)

ppc_hist(y = HtWtData300$weight,
         yrep = y_sim_HtWt300_robust[1:19, ])

ppc_boxplot(y = HtWtData300$weight,
            yrep = y_sim_HtWt300_robust[1:10, ],
            notch = FALSE)

ppc_dens_overlay(y = HtWtData300$weight,
                 yrep = y_sim_HtWt300_robust[1:50, ])

ppc_stat_2d(y = HtWtData300$weight,
            yrep = y_sim_HtWt300_robust)

---
title: "17.2: Robust regression"
output: html_notebook
---


## Introduction

This is code for Section 17.2 of Kruschke using a Student t noise distribution. Other than that change, this document is very similar to the one from section 7.1.

Make sure that the two files "HtWtData30.csv" and "HtWtData300.csv" are in your project directory.


## Preliminaries

Load necessary packages:

```{r}
library(tidyverse)
library(rstan)
library(tidybayes)
library(bayesplot)
```

Set Stan to save compiled code.

```{r}
rstan_options(auto_write = TRUE)
```

Set Stan to use parallel processing where possible.

```{r}
options(mc.cores = parallel::detectCores())
```


## Data

```{r}
HtWtData30 <- read_csv("HtWtData30.csv")
HtWtData30
```

```{r}
ggplot(HtWtData30, aes(y = weight, x = height)) +
    geom_point()
```

```{r}
HtWtData300 <- read_csv("HtWtData300.csv")
HtWtData300
```

```{r}
ggplot(HtWtData300, aes(y = weight, x = height)) +
    geom_point()
```

Mean center the x variable to get an interpretable intercept. (Kruschke standardizes both variables using z-scores, but that makes the coefficients of the model harder to interpret.)

```{r}
HtWtData30 <- HtWtData30 %>%
    mutate(height_mc = height - mean(height))
HtWtData30
```

```{r}
ggplot(HtWtData30, aes(y = weight, x = height_mc)) +
    geom_point()
```


```{r}
HtWtData300 <- HtWtData300 %>%
    mutate(height_mc = height - mean(height))
HtWtData300
```

```{r}
ggplot(HtWtData300, aes(y = weight, x = height_mc)) +
    geom_point()
```

We're going to run two different examples, so we'll make two lists.

```{r}
stan_data_30 <- list(N = NROW(HtWtData30),
                     y = HtWtData30$weight,
                     x = HtWtData30$height_mc)
stan_data_300 <- list(N = NROW(HtWtData300),
                      y = HtWtData300$weight,
                      x = HtWtData300$height_mc)
```


## Prior

### Simulation code

```{stan, output.var = "HtWt_robust_prior", cache = TRUE}
data {
    int<lower = 0> N;
    vector<lower = 0>[N] y;
    vector[N] x;
}
generated quantities {
    real nu;
    real beta0;
    real beta1;
    vector[N] mu;
    real<lower = 0> sigma;
    real y_sim[N];
    
    nu = gamma_rng(2, 0.1);       // default prior for df
    beta0 = normal_rng(150, 25);  // Intercept between 100 and 200
    beta1 = normal_rng(0, 5);    // -10 to 10 lbs per inch
    mu = beta0 + beta1 * x;       // linear model
    sigma = exponential_rng(1.0/100.0); // sd of 100 lbs
    y_sim = student_t_rng(nu, mu, sigma);
}
```

```{r, cache = TRUE}
fit_HtWt30_robust_prior <- sampling(HtWt_robust_prior,
                                    data = stan_data_30,
                                    chains = 1,
                                    algorithm = "Fixed_param",
                                    seed = 11111,
                                    refresh = 0)
```

```{r}
samples_HtWt30_robust_prior <- tidy_draws(fit_HtWt30_robust_prior)
samples_HtWt30_robust_prior
```

### Examine prior

```{r}
mcmc_hist(samples_HtWt30_robust_prior,
          pars = c("nu", "sigma", "beta0", "beta1"))
```

```{r}
mcmc_pairs(fit_HtWt30_robust_prior,
           pars = c("nu", "sigma", "beta0", "beta1"))
```

```{r}
ggplot(HtWtData30, aes(y = weight, x = height_mc)) +
    geom_point() +
    geom_abline(data = samples_HtWt30_robust_prior,
                aes(intercept = beta0, slope = beta1),
                alpha = 0.05)
```


### Prior predictive distribution

```{r}
y_sim_HtWt30_prior <- samples_HtWt30_robust_prior %>%
    select(starts_with("y_sim")) %>%
    as.matrix()
```

```{r}
ppd_intervals(ypred = y_sim_HtWt30_prior,
              x = HtWtData30$height_mc)
```


## Model

```{stan, output.var = "HtWt_robust", cache = TRUE}
data {
    int<lower = 0> N;
    vector<lower = 0>[N] y;
    vector[N] x;
}
parameters {
    real nu;
    real beta0;
    real beta1;
    real<lower = 0> sigma;
}
transformed parameters {
    vector[N] mu;
    mu = beta0 + beta1 * x;   // linear model
}
model {
    nu ~ gamma(2, 0.1);       // default prior for df
    beta0 ~ normal(150, 25);  // Intercept between 100 and 200
    beta1 ~ normal(0, 5);    // -10 to 10 lbs per inch
    sigma ~ exponential(1.0/100.0); // sd of 100 lbs
    y ~ student_t(nu, mu, sigma);
}
generated quantities {
    real y_sim[N];
    y_sim = student_t_rng(nu, mu, sigma);
}
```

```{r, cache = TRUE}
fit_HtWt30_robust <- sampling(HtWt_robust,
                              data = stan_data_30,
                              seed = 11111,
                              refresh = 0)
```

```{r}
samples_HtWt30_robust <- tidy_draws(fit_HtWt30_robust)
samples_HtWt30_robust
```

## Model diagnostics

```{r}
stan_trace(fit_HtWt30_robust,
           pars = c("nu", "sigma", "beta0", "beta1"))
```

```{r}
mcmc_acf(fit_HtWt30_robust,
         pars = c("nu", "sigma", "beta0", "beta1"))
```


## Model summary

```{r}
print(fit_HtWt30_robust,
      pars = c("nu", "beta0", "beta1", "sigma"))
```


## Model visualization

```{r}
mcmc_areas(fit_HtWt30_robust, pars = "nu")
```

```{r}
mcmc_areas(fit_HtWt30_robust, pars ="sigma")
```

```{r}
mcmc_areas(fit_HtWt30_robust, pars ="beta0")
```

```{r}
mcmc_areas(fit_HtWt30_robust, pars = "beta1")
```

```{r}
pairs(fit_HtWt30_robust,
      pars = c("nu", "sigma", "beta0", "beta1"))
```


## Posterior predictive check

```{r}
y_sim_HtWt30_robust <- samples_HtWt30_robust %>%
    select(starts_with("y_sim")) %>%
    as.matrix()
```

```{r}
ggplot(HtWtData30, aes(y = weight, x = height_mc)) +
    geom_point() +
    geom_abline(data = samples_HtWt30_robust,
                aes(intercept = beta0, slope = beta1),
                alpha = 0.01) +
    geom_abline(data = samples_HtWt30_robust,
                aes(intercept = mean(beta0), slope = mean(beta1)),
                color = "blue", linewidth = 2)
```

Compare to standard linear regression.

```{r}
ggplot(HtWtData30, aes(y = weight, x = height_mc)) +
    geom_point() +
    geom_abline(data = samples_HtWt30_robust,
                aes(intercept = beta0, slope = beta1),
                alpha = 0.01) +
    geom_abline(data = samples_HtWt30_robust,
                aes(intercept = mean(beta0), slope = mean(beta1)),
                color = "blue", linewidth = 2) +
    geom_smooth(method = lm, color = "red", linewidth = 2)
```

```{r}
ppc_intervals(y = HtWtData30$weight,
              x = HtWtData30$height_mc,
              yrep = y_sim_HtWt30_robust,
              prob = 0.68,
              prob_outer = 0.95)
```

```{r}
ppc_hist(y = HtWtData30$weight,
         yrep = y_sim_HtWt30_robust[1:19, ])
```

```{r}
ppc_boxplot(y = HtWtData30$weight,
            yrep = y_sim_HtWt30_robust[1:10, ],
            notch = FALSE)
```

```{r}
ppc_dens_overlay(y = HtWtData30$weight,
                 yrep = y_sim_HtWt30_robust[1:50, ])
```

```{r}
ppc_stat_2d(y = HtWtData30$weight,
            yrep = y_sim_HtWt30_robust)
```




## The model with 300 observations

```{r, cache = TRUE}
fit_HtWt300_robust <- sampling(HtWt_robust,
                               data = stan_data_300,
                               seed = 11111,
                               refresh = 0)
```

```{r}
samples_HtWt300_robust <- tidy_draws(fit_HtWt300_robust)
samples_HtWt300_robust
```

## Model diagnostics

```{r}
stan_trace(fit_HtWt300_robust,
           pars = c("nu", "sigma", "beta0", "beta1"))
```

```{r}
mcmc_acf(fit_HtWt300_robust,
         pars = c("nu", "sigma", "beta0", "beta1"))
```


## Model summary

```{r}
print(fit_HtWt300_robust,
      pars = c("nu", "beta0", "beta1", "sigma"))
```


## Model visualization

```{r}
mcmc_areas(fit_HtWt300_robust, pars = "nu")
```

```{r}
mcmc_areas(fit_HtWt300_robust, pars ="sigma")
```

```{r}
mcmc_areas(fit_HtWt300_robust, pars ="beta0")
```

```{r}
mcmc_areas(fit_HtWt300_robust, pars = "beta1")
```

```{r}
pairs(fit_HtWt300_robust,
      pars = c("nu", "sigma", "beta0", "beta1"))
```


## Posterior predictive check

```{r}
y_sim_HtWt300_robust <- samples_HtWt300_robust %>%
    select(starts_with("y_sim")) %>%
    as.matrix()
```

```{r}
ggplot(HtWtData300, aes(y = weight, x = height_mc)) +
    geom_point() +
    geom_abline(data = samples_HtWt300_robust,
                aes(intercept = beta0, slope = beta1),
                alpha = 0.01) +
    geom_abline(data = samples_HtWt300_robust,
                aes(intercept = mean(beta0), slope = mean(beta1)),
                color = "blue", linewidth = 2)
```

Compare to standard linear regression.

```{r}
ggplot(HtWtData300, aes(y = weight, x = height_mc)) +
    geom_point() +
    geom_abline(data = samples_HtWt300_robust,
                aes(intercept = beta0, slope = beta1),
                alpha = 0.01) +
    geom_abline(data = samples_HtWt300_robust,
                aes(intercept = mean(beta0), slope = mean(beta1)),
                color = "blue", linewidth = 2) +
    geom_smooth(method = lm, color = "red", linewidth = 2)
```

```{r}
ppc_intervals(y = HtWtData300$weight,
              x = HtWtData300$height_mc,
              yrep = y_sim_HtWt300_robust,
              prob = 0.68,
              prob_outer = 0.95)
```

```{r}
ppc_hist(y = HtWtData300$weight,
         yrep = y_sim_HtWt300_robust[1:19, ])
```

```{r}
ppc_boxplot(y = HtWtData300$weight,
            yrep = y_sim_HtWt300_robust[1:10, ],
            notch = FALSE)
```

```{r}
ppc_dens_overlay(y = HtWtData300$weight,
                 yrep = y_sim_HtWt300_robust[1:50, ])
```

```{r}
ppc_stat_2d(y = HtWtData300$weight,
            yrep = y_sim_HtWt300_robust)
```